Summary of Test data
summary(test)
## INCIDENT_ID DATE X_1 X_2
## Length:15903 Length:15903 Min. :0.0000 Min. : 0.00
## Class :character Class :character 1st Qu.:0.0000 1st Qu.: 7.00
## Mode :character Mode :character Median :0.0000 Median :24.00
## Mean :0.4681 Mean :24.72
## 3rd Qu.:0.0000 3rd Qu.:36.00
## Max. :7.0000 Max. :52.00
##
## X_3 X_4 X_5 X_6
## Min. : 0.00 Min. : 0.000 Min. :0.000 Min. : 1.000
## 1st Qu.: 8.00 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 3.000
## Median :24.00 Median : 4.000 Median :3.000 Median : 5.000
## Mean :24.58 Mean : 4.284 Mean :2.448 Mean : 6.085
## 3rd Qu.:35.00 3rd Qu.: 6.000 3rd Qu.:5.000 3rd Qu.: 8.000
## Max. :52.00 Max. :10.000 Max. :5.000 Max. :19.000
##
## X_7 X_8 X_9 X_10
## Min. : 0.000 Min. : 0.0000 Min. :0.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 0.0000 1st Qu.:5.000 1st Qu.: 1.000
## Median : 4.000 Median : 1.0000 Median :5.000 Median : 1.000
## Mean : 4.863 Mean : 0.9867 Mean :4.909 Mean : 1.241
## 3rd Qu.: 7.000 3rd Qu.: 1.0000 3rd Qu.:6.000 3rd Qu.: 1.000
## Max. :18.000 Max. :50.0000 Max. :6.000 Max. :40.000
##
## X_11 X_12 X_13 X_14
## Min. : 0 Min. : 0.0000 Min. : 0.00 Min. : 0.00
## 1st Qu.:174 1st Qu.: 1.0000 1st Qu.: 72.00 1st Qu.: 29.00
## Median :249 Median : 1.0000 Median : 98.00 Median : 62.00
## Mean :207 Mean : 0.9722 Mean : 85.19 Mean : 72.22
## 3rd Qu.:249 3rd Qu.: 1.0000 3rd Qu.:103.00 3rd Qu.:107.00
## Max. :332 Max. :40.0000 Max. :117.00 Max. :142.00
## NA's :127
## X_15
## Min. : 0.00
## 1st Qu.:34.00
## Median :34.00
## Mean :33.42
## 3rd Qu.:34.00
## Max. :50.00
##
The following plot shows Number hack attacks on digital payments from the year 1991 to 2018.
train %>% group_by(MULTIPLE_OFFENSE) %>% summarise(counts = n()) %>% plot_ly(
labels = ~ MULTIPLE_OFFENSE,
values = ~ counts,
type = 'pie'
) %>% layout(legend = list(
orientation = "h",
xanchor = "center",
x = 0.5
))
## `summarise()` ungrouping output (override with `.groups` argument)
Let visualize the train & test data columns from X_1 to X_15 and see the data distribution of them
Train data
ggplotly(
train %>% select(-MULTIPLE_OFFENSE) %>%
gather(measurement, value, X_1:X_15, factor_key = T) %>%
ggplot(aes(x = value, fill = measurement)) + geom_histogram() +
facet_wrap(~ measurement , scales = "free" , ncol = 3) + theme_light() +
theme(
legend.position = "none",
panel.spacing = unit(2, "lines"),
axis.title.x = element_blank(),
axis.title.y = element_blank()
)
)
Test data
ggplotly(
test %>%
gather(measurement, value, X_1:X_15, factor_key = T) %>%
ggplot(aes(x = value, fill = measurement)) + geom_histogram() +
facet_wrap(~ measurement, scales = "free", ncol = 3) + theme_light() +
theme(
legend.position = "none",
panel.spacing = unit(2, "lines"),
axis.title.x = element_blank(),
axis.title.y = element_blank()
)
)
Next extract the features like day, month, year, week ,weekday,quarter and leap year from date column
{
fulldata$date_day <- lubridate::day(fulldata$DATE)#extract day
fulldata <- fulldata %>% select(1:2, 19, everything())#column reorder
fulldata$date_month <- lubridate::month(fulldata$DATE, label=T)#extract month
fulldata <- fulldata %>% select(1:3, 20, everything())#column reorder
fulldata$date_year <- lubridate::year(fulldata$DATE)#extract year
fulldata <- fulldata %>% select(1:4, 21, everything())#column reorder
fulldata <- fulldata %>% group_by(DATE) %>% mutate(date_count = n())#Date count
fulldata <- fulldata %>% select(1:2, 22, everything())#column reorder
fulldata$date_quarter <- lubridate::quarter(fulldata$DATE)#extract quarter
fulldata <- fulldata %>% select(1:6, 23, everything())#column reorder
fulldata$date_week <- lubridate::week(fulldata$DATE)#extract week
fulldata <- fulldata %>% select(1:7, 24, everything())#column reorder
fulldata$date_wday <- lubridate::wday(fulldata$DATE , label=T )#extract weekday
fulldata <- fulldata %>% select(1:8, 25, everything())#column reorder
fulldata$date_leap_year <- lubridate::leap_year(fulldata$DATE)#extract leap year info
fulldata <- fulldata %>% select(1:9, 26, everything())#column reorder
}
datatable(head(fulldata[,c(2:10)]), class = 'cell-border stripe')
The following time series chart explains how the number of hack attacks on digital payments changed over the time.
#filtering train data from fulldata
highchart() %>%
hc_add_series(
data =
fulldata %>% filter(!is.na(MULTIPLE_OFFENSE)) %>%
group_by(date_year, MULTIPLE_OFFENSE) %>%
summarise(counts = n()) ,
type = "line",
hcaes(x = date_year,
y = counts,
group = MULTIPLE_OFFENSE)
)
lets break down the time series plot into various levels and see how the number of hack attacks on digital payments changed in weeks, months, quarters and leap years from the year 1991 to 2018.
The following visualization explains the number hack attacks in each month over the year
ggplotly(
fulldata %>% drop_na(MULTIPLE_OFFENSE) %>%
filter(MULTIPLE_OFFENSE == 'Yes') %>%
group_by(date_month, date_year, MULTIPLE_OFFENSE) %>%
summarise(counts = n()) %>%
ggplot(
aes(
x = reorder_within(date_month,-counts,
date_year),
y = counts,
fill = date_month,
text = paste("Month:",
date_month,
"<br> counts:",
counts)
)
) +
geom_bar(stat = "identity",
position = "stack") + scale_fill_brewer(palette = "Set3") + xlab("Month") +
facet_wrap( ~ date_year , scales = "free" , ncol = 3) +
scale_x_reordered() +
theme(
panel.spacing = unit(1, "lines"),
axis.text.x = element_blank(),
axis.title.x = element_text(face = "bold", hjust = 0.5),
axis.title.y = element_blank()
) ,
tooltip = "text"
) %>%
layout(legend = list(
orientation = "h",
x = 0,
y = -0.05
))
The following visualization explains the number hack attacks in each quarter over the year.In the 2nd & 3rd quarter, more number of hack attacks occurred
ggplotly(
fulldata %>% drop_na(MULTIPLE_OFFENSE) %>%
filter(MULTIPLE_OFFENSE == 'Yes') %>%
dplyr::group_by(date_quarter, date_year, MULTIPLE_OFFENSE) %>%
summarise(counts = n()) %>%
ggplot(aes(
x = reorder_within(date_quarter,-counts,
date_year),
y = counts,
fill = factor(date_quarter),
text = paste("Quarter:",
date_quarter,
"<br> counts:",
counts)
)) +
geom_bar(stat = "identity",
position = "stack") + scale_fill_brewer(palette = "Accent") + xlab("Quarter") +
facet_wrap( ~ date_year , scales = "free" , ncol = 3) +
scale_x_reordered() +
theme(
panel.spacing = unit(1, "lines"),
axis.text.x = element_text(fac = "bold"),
axis.title.x = element_text(face = "bold", hjust = 0.5),
axis.title.y = element_blank()
),
tooltip = "text"
) %>%
layout(legend = list(
orientation = "h",
x = 0.3,
y = -0.05
))
The following plot explains number hack attacks in each day of the week over the year, the most hack attacks oocured in week days.
ggplotly(
fulldata %>% drop_na(MULTIPLE_OFFENSE) %>%
filter(MULTIPLE_OFFENSE == 'Yes') %>%
group_by(date_wday, date_year, MULTIPLE_OFFENSE) %>%
summarise(counts = n()) %>%
ggplot(aes(
x = reorder_within(date_wday,-counts,
date_year),
y = counts,
fill = date_wday,
text = paste("Week Day:",
date_wday,
"<br> counts:",
counts)
)) +
geom_bar(stat = "identity",
position = "stack") + scale_fill_brewer(palette = "Set2") + xlab("Week Day") +
facet_wrap( ~ date_year , scales = "free" , ncol = 3) +
scale_x_reordered() +
theme(
panel.spacing = unit(1, "lines"),
axis.text.x = element_blank(),
axis.title.x = element_text(face = "bold", hjust = 0.5),
axis.title.y = element_blank()
),
tooltip = "text"
) %>%
layout(legend = list(
orientation = "h",
x = 0.2,
y = -0.05
))
Ordinary Year - Leap Year hack attacks
fulldata %>% drop_na(MULTIPLE_OFFENSE) %>%
filter(MULTIPLE_OFFENSE == 'Yes') %>%
group_by(date_leap_year, date_year) %>%
summarise(counts = n()) %>%
ggplot(aes(
x = date_year,
y = counts,
label = date_year,
color = date_leap_year
)) +
geom_point() +
geom_text(aes(color = factor(date_leap_year))) +
geom_line(aes(
x = date_year,
y = counts,
color = date_leap_year,
group = 1
),
size = 1) +
xlab("Year") +
scale_x_continuous(breaks = seq(1991, 2018, by = 1)) +
labs(color = "Leap Year\n") +
theme(
legend.position = "bottom",
axis.text.x = element_blank(),
axis.title.y = element_blank()
)

The X_1 to X_15 features are anonymous so not able to identify which one is quantitative and qualitative feature.Lets assume the features X_1 to X_15 as continuous type and see the correlation of those features.
In the above corrrelation plot blue color represents positive correlation and red color represents negative correlation ,the color intensity explains how much the features are correlated from -1 to +1.(0 means no correlation,1 means positive correlation, -1 means negative correlation).
There is a strong positive correlation between X_2 and X_3 , X_10 and X_12.
lets visualize the Missing values from train and test data
Using missRanger package to impute missing values.
fulldataimp<-missRanger(fulldata[,-c(1,2,26)], num.trees = 100,verbose = T,
seed = 887,pmm.k = 4)
##
## Missing value imputation by random forests
##
## Variables to impute: X_12
## Variables used to impute: date_count, date_day, date_month, date_year, date_quarter, date_week, date_wday, date_leap_year, X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8, X_9, X_10, X_11, X_12, X_13, X_14, X_15
## iter 1: .
## iter 2: .
## iter 3: .
insert incident id to new data frame
fulldataimp$INCIDENT_ID<-fulldata$INCIDENT_ID
fulldataimp<-fulldataimp %>% select(24,everything())
insert MULTIPLE_OFFENSE to new data frame
fulldataimp$MULTIPLE_OFFENSE<-fulldata$MULTIPLE_OFFENSE
Lets recode the MULTIPLE_OFFENSE by 1 and 0
fulldataimp$MULTIPLE_OFFENSE<-as.factor(ifelse(fulldataimp$MULTIPLE_OFFENSE=='Yes',1,0))
lets split the data into Train set, validation set, Test set
Train set
trainn<-fulldataimp[complete.cases(fulldataimp$MULTIPLE_OFFENSE),]
Test set
testn<-fulldataimp[!complete.cases(fulldataimp$MULTIPLE_OFFENSE),]
Create a stratified sample train and validation set data from trainn dataframe
index<-createDataPartition(y=trainn$MULTIPLE_OFFENSE ,p=0.75,list = F)
trains<-trainn[index,]
vals<-trainn[-index,]
create a confusion matrix and see how its predicted the actual classes
cm<-confusionMatrix(valpre, factor(vals$MULTIPLE_OFFENSE),mode="everything")
Validation set accuaracy
cm$byClass[7]
## F1
## 0.9608939